
Commit 850f43d

Synchronize with llama.cpp upstream
AMD GPU support on Windows is still in a broken state.
1 parent b52304e · commit 850f43d

21 files changed: +2370 −2310 lines

llama.cpp/README.llamafile

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@ LICENSE
 ORIGIN
 
 https://github.com/ggerganov/llama.cpp/pull/4406/
-67fac4b95fcccfda8ab965e9ba4992a9ddf3a25f
-2024-04-10
+b8109bc0139f15a5b321909f47510b89dca47ffc
+2024-04-21
 
 LOCAL MODIFICATIONS
 

llama.cpp/common.cpp

Lines changed: 92 additions & 5 deletions
@@ -1,16 +1,15 @@
 // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
 // vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
+
 #include "common.h"
+#include "json.h"
+#include "json-schema-to-grammar.h"
 #include "llama.h"
-#include "ggml-cuda.h"
-#include "ggml-metal.h"
 
 #include <algorithm>
 #include <cassert>
 #include <cmath>
-#include <cerrno>
 #include <cstring>
-#include <climits>
 #include <ctime>
 #include <fstream>
 #include <iterator>
@@ -74,6 +73,8 @@
 #define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL
 
+using json = nlohmann::ordered_json;
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
@@ -110,6 +111,79 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#include <pthread.h>
+
+static void cpuid(unsigned leaf, unsigned subleaf,
+                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+    __asm__("movq\t%%rbx,%%rsi\n\t"
+            "cpuid\n\t"
+            "xchgq\t%%rbx,%%rsi"
+            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
+            : "0"(leaf), "2"(subleaf));
+}
+
+static int pin_cpu(int cpu) {
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(cpu, &mask);
+    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
+}
+
+static bool is_hybrid_cpu(void) {
+    unsigned eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15));
+}
+
+static bool is_running_on_efficiency_core(void) {
+    unsigned eax, ebx, ecx, edx;
+    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
+    int intel_atom = 0x20;
+    int core_type = (eax & 0xff000000u) >> 24;
+    return core_type == intel_atom;
+}
+
+static int count_math_cpus(int cpu_count) {
+    int result = 0;
+    for (int cpu = 0; cpu < cpu_count; ++cpu) {
+        if (pin_cpu(cpu)) {
+            return -1;
+        }
+        if (is_running_on_efficiency_core()) {
+            continue; // efficiency cores harm lockstep threading
+        }
+        ++cpu; // hyperthreading isn't useful for linear algebra
+        ++result;
+    }
+    return result;
+}
+
+#endif // __x86_64__ && __linux__
+
+/**
+ * Returns number of CPUs on system that are useful for math.
+ */
+int get_math_cpu_count() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
+    if (cpu_count < 1) {
+        return get_num_physical_cores();
+    }
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity;
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            int result = count_math_cpus(cpu_count);
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+            if (result > 0) {
+                return result;
+            }
+        }
+    }
+#endif
+    return get_num_physical_cores();
+}
+
 void process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
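
Note: the new get_math_cpu_count() path pins the calling thread to each CPU in turn and asks CPUID what kind of core it landed on: leaf 7 EDX bit 15 flags a hybrid part, and leaf 0x1A reports the current core's type (0x20 = Intel Atom, i.e. an efficiency core). A minimal standalone sketch of those two probes, assuming GCC/Clang's <cpuid.h> on x86-64 Linux; this sketch is not part of the commit:

    // Hedged sketch: the same CPUID checks as above, via the compiler's <cpuid.h> helper.
    #include <cpuid.h>
    #include <cstdio>

    int main() {
        unsigned eax, ebx, ecx, edx;
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) return 1;
        bool hybrid = edx & (1u << 15);         // CPUID.07H:EDX[15] = hybrid part
        printf("hybrid cpu: %s\n", hybrid ? "yes" : "no");
        if (hybrid && __get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx)) {
            int core_type = (eax >> 24) & 0xff; // CPUID.1AH:EAX[31:24], 0x20 = Atom (E-core)
            printf("core type of current cpu: %#x\n", core_type);
        }
        return 0;
    }

The patch queries leaf 0x1A only after pinning to a specific CPU, because that leaf describes whichever core the thread happens to be running on, and it restores the original affinity mask afterwards.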
@@ -1167,6 +1241,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         );
         return true;
     }
+    if (arg == "-j" || arg == "--json-schema") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+        return true;
+    }
     if (arg == "--override-kv") {
         if (++i >= argc) {
             invalid_param = true;
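
Note: the new -j/--json-schema flag converts a JSON schema into a GBNF grammar and stores it in sparams.grammar, so the sampling code downstream is unchanged. A hedged sketch of the same conversion done directly (json.h and json-schema-to-grammar.h are the headers pulled in at the top of this file; the schema string is only an example):

    #include <cstdio>
    #include <string>
    #include "json.h"                     // nlohmann::ordered_json, bundled with llama.cpp
    #include "json-schema-to-grammar.h"

    using json = nlohmann::ordered_json;

    int main() {
        // Example schema: an object with a required string field "name".
        const char * schema =
            R"({"type":"object","properties":{"name":{"type":"string"}},"required":["name"]})";
        std::string grammar = json_schema_to_grammar(json::parse(schema));
        printf("%s\n", grammar.c_str());  // GBNF text, the same thing --json-schema feeds the sampler
        return 0;
    }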
@@ -1374,6 +1456,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
     printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
     printf(" --grammar-file FNAME file to read grammar from\n");
+    printf(" -j SCHEMA, --json-schema SCHEMA\n");
+    printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
+    printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
     printf(" --cfg-negative-prompt PROMPT\n");
     printf(" negative prompt to use for guidance. (default: empty)\n");
     printf(" --cfg-negative-prompt-file FNAME\n");
@@ -1766,6 +1851,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.defrag_thold = params.defrag_thold;
+    cparams.cb_eval = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
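
Note: cb_eval and cb_eval_user_data forward a ggml scheduler eval callback from gpt_params into llama_context_params. A hedged sketch of what a caller might install; the callback name and the logging are illustrative, and the ask/result two-phase protocol follows how llama.cpp's imatrix example uses this hook:

    #include <cstdio>
    #include "common.h"        // gpt_params now carries cb_eval / cb_eval_user_data
    #include "ggml-backend.h"  // ggml_backend_sched_eval_callback

    // Called by the scheduler first with ask == true ("do you want this node?")
    // and again with ask == false once the tensor has been computed.
    static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * /*user_data*/) {
        if (ask) {
            return true;                     // observe every node
        }
        fprintf(stderr, "evaluated %s\n", t->name);
        return true;                         // returning false stops the evaluation early
    }

    static void install_eval_callback(gpt_params & params) {
        params.cb_eval           = my_eval_cb;
        params.cb_eval_user_data = nullptr;  // forwarded verbatim to the callback
    }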
@@ -2213,7 +2300,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }
 
-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
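
Note: the warm-up decode used to run unconditionally; it is now gated on the new gpt_params::warmup flag (default true, see common.h below). A hedged sketch of a caller opting out, e.g. a tool that only tokenizes; everything beyond the common.h API shown in this diff is illustrative, and real callers also initialize the llama backend first:

    #include <tuple>
    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }
        params.warmup = false;  // skip the empty BOS/EOS warm-up decode
        llama_model * model = nullptr;
        llama_context * ctx = nullptr;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == nullptr || ctx == nullptr) {
            return 1;
        }
        // ... use ctx ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }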

llama.cpp/common.h

Lines changed: 7 additions & 1 deletion
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "llamafile/log.h"
 #include "llama.h"
 
 #include "sampling.h"
@@ -43,6 +44,7 @@ extern char const *LLAMA_BUILD_TARGET;
 
 struct llama_control_vector_load_info;
 
+int get_math_cpu_count();
 int32_t get_num_physical_cores();
 
 //
@@ -52,7 +54,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads = llamafile_get_math_cpu_count();
+    int32_t n_threads = get_math_cpu_count();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
@@ -84,6 +86,9 @@ struct gpt_params {
    int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
 
+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data = nullptr;
+
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
     llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -160,6 +165,7 @@ struct gpt_params {
     bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
+    bool warmup = true; // warmup run
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

llama.cpp/ggml-backend.c

Lines changed: 4 additions & 9 deletions
@@ -2,29 +2,22 @@
 // vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
 
 #include "ggml-backend-impl.h"
+#include "ggml-alloc.h"
+#include "ggml-impl.h"
 
 #include <assert.h>
 #include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdlib.h>
-
-#include "ggml-alloc.h"
-#include "ggml-cuda.h"
-#include "ggml-impl.h"
-#include "ggml-metal.h"
-#include "llamafile/log.h"
 
 #ifndef NDEBUG
 #define NDEBUG // [jart] delete printf debugging
 #endif
 
-
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-
 // backend buffer type
 
 const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
@@ -2107,6 +2100,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
     return true;
 }
 
+#include "llamafile/log.h"
+
 GGML_CALL static void system_exit(int rc) {
     exit(rc);
 }
